--- layout: notebook title: Blog Post 1 ---
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import sqlite3
from plotly import express as px
from sklearn.linear_model import LinearRegression
import calendar
conn = sqlite3.connect("temps.db")
df_iter = pd.read_csv("temps.csv", chunksize = 100000)
df = df_iter.__next__()
df.head()
| ID | Year | VALUE1 | VALUE2 | VALUE3 | VALUE4 | VALUE5 | VALUE6 | VALUE7 | VALUE8 | VALUE9 | VALUE10 | VALUE11 | VALUE12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ACW00011604 | 1961 | -89.0 | 236.0 | 472.0 | 773.0 | 1128.0 | 1599.0 | 1570.0 | 1481.0 | 1413.0 | 1174.0 | 510.0 | -39.0 |
| 1 | ACW00011604 | 1962 | 113.0 | 85.0 | -154.0 | 635.0 | 908.0 | 1381.0 | 1510.0 | 1393.0 | 1163.0 | 994.0 | 323.0 | -126.0 |
| 2 | ACW00011604 | 1963 | -713.0 | -553.0 | -99.0 | 541.0 | 1224.0 | 1627.0 | 1620.0 | 1596.0 | 1332.0 | 940.0 | 566.0 | -108.0 |
| 3 | ACW00011604 | 1964 | 62.0 | -85.0 | 55.0 | 738.0 | 1219.0 | 1442.0 | 1506.0 | 1557.0 | 1221.0 | 788.0 | 546.0 | 112.0 |
| 4 | ACW00011604 | 1965 | 44.0 | -105.0 | 38.0 | 590.0 | 987.0 | 1500.0 | 1487.0 | 1477.0 | 1377.0 | 974.0 | 31.0 | -178.0 |
def prepare_df(df):
df = df.set_index(keys=["ID", "Year"])
df = df.stack()
df = df.reset_index()
df = df.rename(columns = {"level_2" : "Month" , 0 : "Temp"})
df["Month"] = df["Month"].str[5:].astype(int)
df["Temp"] = df["Temp"] / 100
return(df)
df = prepare_df(df)
df.head()
| ID | Year | Month | Temp | |
|---|---|---|---|---|
| 0 | ACW00011604 | 1961 | 1 | -0.89 |
| 1 | ACW00011604 | 1961 | 2 | 2.36 |
| 2 | ACW00011604 | 1961 | 3 | 4.72 |
| 3 | ACW00011604 | 1961 | 4 | 7.73 |
| 4 | ACW00011604 | 1961 | 5 | 11.28 |
df_iter = pd.read_csv("temps.csv", chunksize = 100000)
for df in df_iter:
df = prepare_df(df)
df.to_sql("temperatures", conn, if_exists = "append", index = False)
df.head()
| ID | Year | Month | Temp | |
|---|---|---|---|---|
| 0 | USW00014924 | 2016 | 1 | -13.69 |
| 1 | USW00014924 | 2016 | 2 | -8.40 |
| 2 | USW00014924 | 2016 | 3 | -0.20 |
| 3 | USW00014924 | 2016 | 4 | 3.21 |
| 4 | USW00014924 | 2016 | 5 | 13.85 |
url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/noaa-ghcn/station-metadata.csv"
stations = pd.read_csv(url)
stations["FIPS 10-4"] = stations["ID"].str[0:2]
stations.to_sql("stations", conn, if_exists = "replace", index = False)
C:\Users\Matthew\anaconda3\envs\PIC16B\lib\site-packages\pandas\core\generic.py:2872: UserWarning: The spaces in these column names will not be changed. In pandas versions < 0.14, spaces were converted to underscores. sql.to_sql(
stations.head()
| ID | LATITUDE | LONGITUDE | STNELEV | NAME | FIPS 10-4 | |
|---|---|---|---|---|---|---|
| 0 | ACW00011604 | 57.7667 | 11.8667 | 18.0 | SAVE | AC |
| 1 | AE000041196 | 25.3330 | 55.5170 | 34.0 | SHARJAH_INTER_AIRP | AE |
| 2 | AEM00041184 | 25.6170 | 55.9330 | 31.0 | RAS_AL_KHAIMAH_INTE | AE |
| 3 | AEM00041194 | 25.2550 | 55.3640 | 10.4 | DUBAI_INTL | AE |
| 4 | AEM00041216 | 24.4300 | 54.4700 | 3.0 | ABU_DHABI_BATEEN_AIR | AE |
countries_url = "https://raw.githubusercontent.com/mysociety/gaze/master/data/fips-10-4-to-iso-country-codes.csv"
countries = pd.read_csv(countries_url)
countries.to_sql("countries", conn, if_exists = "replace", index = False)
C:\Users\Matthew\anaconda3\envs\PIC16B\lib\site-packages\pandas\core\generic.py:2872: UserWarning: The spaces in these column names will not be changed. In pandas versions < 0.14, spaces were converted to underscores. sql.to_sql(
countries.head()
| FIPS 10-4 | ISO 3166 | Name | |
|---|---|---|---|
| 0 | AF | AF | Afghanistan |
| 1 | AX | - | Akrotiri |
| 2 | AL | AL | Albania |
| 3 | AG | DZ | Algeria |
| 4 | AQ | AS | American Samoa |
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print(cursor.fetchall())
[('temperatures',), ('stations',), ('countries',)]
conn.close()
def query_climate_database(country, year_begin, year_end, month):
"""
docstring
"""
cmd = \
"""
SELECT S.NAME, S.LATITUDE, S.LONGITUDE, C.Name AS Country, T.Year, T.Month, T.Temp
FROM stations S
LEFT JOIN countries C ON S."FIPS 10-4" = C."FIPS 10-4"
LEFT JOIN temperatures T ON S.ID = T.ID
"""
cmd += " WHERE C.Name = \"" + country + "\""
cmd += " AND T.Year >= " + str(year_begin) + " AND T.Year <= " + str(year_end)
cmd += " AND T.Month = " + str(month)
return pd.read_sql_query(cmd, conn)
conn = sqlite3.connect("temps.db")
df = query_climate_database(country = "India", year_begin = 1980, year_end = 2020, month = 1)
conn.close()
df
| NAME | LATITUDE | LONGITUDE | Country | Year | Month | Temp | |
|---|---|---|---|---|---|---|---|
| 0 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1980 | 1 | 23.48 |
| 1 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1980 | 1 | 23.48 |
| 2 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1981 | 1 | 24.57 |
| 3 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1981 | 1 | 24.57 |
| 4 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1982 | 1 | 24.19 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 6299 | DARJEELING | 27.050 | 88.270 | India | 1994 | 1 | 8.10 |
| 6300 | DARJEELING | 27.050 | 88.270 | India | 1995 | 1 | 5.60 |
| 6301 | DARJEELING | 27.050 | 88.270 | India | 1995 | 1 | 5.60 |
| 6302 | DARJEELING | 27.050 | 88.270 | India | 1997 | 1 | 5.70 |
| 6303 | DARJEELING | 27.050 | 88.270 | India | 1997 | 1 | 5.70 |
6304 rows × 7 columns
def coef(data_group):
"""
docstring
"""
x = data_group[["Year"]]
y = data_group["Temp"]
LR = LinearRegression()
LR.fit(x, y)
return LR.coef_[0]
def int_to_month(i):
month_dict = {1 : "January",
2 : "February",
3 : "March",
4 : "April"
}
type(calendar.month_name[1])
str
def temperature_coefficient_plot(country, year_begin, year_end, month, min_obs, **kwargs):
"""
"""
df = query_climate_database(country, year_begin, year_end, month)
#counts = (pd.DataFrame(df.groupby('NAME').size())).reset_index()
counts = (df.groupby('NAME').size()).reset_index()
counts = counts[counts[0] > min_obs]
df = df.merge(counts, how = 'inner', on = 'NAME')
coefs = (df.groupby(["NAME", "Month"]).apply(coef)).reset_index()
df = df.merge(coefs, how = 'left', on = ['NAME', 'Month'])
# df = df.rename({"0_x" : "Counts", "0_y" : "TempChange"})
# print(df)
title = "Estimates of yearly increase in " + calendar.month_name[month] + " for stations in " + country + ", years " + str(year_begin) + " - " + str(year_end)
fig = px.scatter_mapbox(df,
lat = "LATITUDE",
lon = "LONGITUDE",
hover_name = "NAME",
height = 300,
color = '0_y',
color_continuous_midpoint = 0,
title = title,
hover_data = {"0_y" : ':.4f'},
**kwargs)
fig.update_layout(mapbox_style="carto-positron")
fig.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
fig.update_layout()
fig.show()
conn = sqlite3.connect("temps.db")
color_map = px.colors.diverging.RdGy_r
temperature_coefficient_plot("India", 1980, 2020, 1, 10,
zoom = 2,
mapbox_style = "carto-positron",
color_continuous_scale = color_map)
conn.close()